Updated: Sep 20, 2023 at 21:59:35 PDT.
Working directory: /home/groups/jgrimmer/trump_blocs.
Reads:
Writes:
rm(list = ls())
library(fst)
library(dplyr)
library(data.table)
cps <- fread("data/ipums-cps/cps_00008.csv")
cps[ , source := "CPS"]
cps[ ,
`:=`(
year = as.character(YEAR),
cpsid = CPSIDP,
weight = VOSUPPWT,
statefips = coler::lz_pad(STATEFIP, 2),
citizen = ifelse(YEAR < 1994, 1L, as.numeric(CITIZEN != 5)),
urban = recode(METRO, "1" = "rural", "2" = "urban", "3" = "suburban",
.default = NA_character_),
race = case_when(RACE %in%
c(200, 801, 805:807, 810:811, 814, 816, 818) ~ "black",
RACE == 100 ~ "white",
TRUE ~ "other"),
age = as.integer(AGE),
gender = recode(SEX, `1` = "male", `2` = "female",
.default = NA_character_),
faminc = FAMINC,
# https://cps.ipums.org/cps-action/downloads/extract_files/cps_00003.xml#EDUC
educ = as.numeric(EDUC) %>%
{
case_when(. <= 73 ~ "HS or less",
# Includes associate's degree
. %in% 74:110 ~ "some college",
# Includes 4+ years of college
. %in% c(111:125) ~ "college")
},
region = REGION %>%
{
case_when(. %in% 11:13 ~ "Northeast",
. %in% 21:23 ~ "Midwest",
. %in% 31:34 ~ "South",
. %in% 41:43 ~ "West",
. %in% 91:99 ~ NA_character_)
},
voted = recode(VOTED,
"01" = 0L, "02" = 1L, .default = NA_integer_)
)
] %>%
# filter to CVAP
filter(citizen == 1, age >= 18)
Copied from Will’s CCES code: age groups in 10-year bands, except under 20 is counted w/ 20s
mfloor <- function(x,base){
base * floor(x/base)
}
cps[ , age_bin := mfloor(age, 10)]
cps[age < 20, age_bin := 20]
cps[!is.na(faminc),
# Matching ANES cut points
`:=`(fincome = faminc,
faminc_quin = ntile(faminc, 5) %>%
recode_factor(`1` = "1st",
`2` = "2nd",
`3` = "3rd",
`4` = "4th",
`5` = "5th"
),
faminc_terc =
ntile(faminc, 3) %>%
recode_factor(`1` = "1st", `2` = "2nd", `3` = "3rd")
), by = .(year, source)
]
cps[!is.na(voted) & !is.na(faminc_quin),
stopifnot(.N > 2e4), by = year]
From CPS-IPUMS codebook:
99999999 = N.I.U. (Not in Universe). 1968-1975: -9999997 (Loss of $9999 or more dollars).
cps[HISPAN %in% 100:612 & race %in% c("white", "other"),
race := "hispanic"]
cps[ , hisp_origin :=
case_when(HISPAN %in% 100:109 ~ "mexican",
HISPAN == 200 ~ "puerto rican",
HISPAN == 300 ~ "cuban",
HISPAN %in% 600:612 ~ "other",
# Fewer than 2000 respondents total
# HISPAN == 400 ~ "dominican",
# HISPAN == 500 ~ "salvadoran",
TRUE ~ NA_character_)
]
fips <- fastLink::statefips
setDT(fips, key = "statefips")
fips[ , south := as.numeric(state %in%
c("TN", "VA", "NC", "SC", "FL",
"GA", "AL", "MS", "LA", "AR", "TX",
# Schickler adds these
"OK", "KY"))
]
cps[fips,
c("south", "state") := .(i.south, i.state),
on = "statefips"
]
For some reason, CPS is missing weights before 1976.
cps[is.na(weight) & year < 1976, weight := 1]
keep <- grep("[a-z]", names(cps), value = TRUE)
cps_out <- dplyr::select(cps, all_of(keep))
setDT(cps_out)
skimr::skim(cps_out)
## Warning: Couldn't find skimmers for class: integer64; No user-defined `sfl`
## provided. Falling back to `character`.
| Name | cps_out |
| Number of rows | 3151844 |
| Number of columns | 21 |
| Key | NULL |
| _______________________ | |
| Column type frequency: | |
| character | 10 |
| factor | 3 |
| numeric | 8 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| source | 0 | 1.00 | 3 | 3 | 0 | 1 | 0 |
| year | 0 | 1.00 | 4 | 4 | 0 | 23 | 0 |
| cpsid | 0 | 1.00 | 1 | 21 | 0 | 3079423 | 0 |
| statefips | 0 | 1.00 | 2 | 2 | 0 | 51 | 0 |
| urban | 643853 | 0.80 | 5 | 8 | 0 | 3 | 0 |
| race | 0 | 1.00 | 5 | 8 | 0 | 4 | 0 |
| gender | 0 | 1.00 | 4 | 6 | 0 | 2 | 0 |
| educ | 72422 | 0.98 | 7 | 12 | 0 | 3 | 0 |
| region | 0 | 1.00 | 4 | 9 | 0 | 4 | 0 |
| hisp_origin | 2861170 | 0.09 | 5 | 12 | 0 | 4 | 0 |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| faminc_quin | 425102 | 0.87 | FALSE | 5 | 1st: 545357, 2nd: 545356, 3rd: 545347, 4th: 545344 |
| faminc_terc | 425102 | 0.87 | FALSE | 3 | 1st: 908921, 2nd: 908916, 3rd: 908905 |
| state | 0 | 1.00 | FALSE | 51 | CA: 260400, NY: 173061, TX: 157626, FL: 128725 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| weight | 0 | 1.00 | 1962.56 | 1146.47 | 0 | 1136.19 | 1795.92 | 2711.42 | 32493.72 | ▇▁▁▁▁ |
| citizen | 0 | 1.00 | 0.97 | 0.18 | 0 | 1.00 | 1.00 | 1.00 | 1.00 | ▁▁▁▁▇ |
| age | 0 | 1.00 | 36.18 | 22.46 | 0 | 17.00 | 34.00 | 53.00 | 99.00 | ▇▇▇▅▁ |
| faminc | 425102 | 0.87 | 672.50 | 231.86 | 100 | 540.00 | 730.00 | 840.00 | 999.00 | ▂▂▃▅▇ |
| voted | 1143378 | 0.64 | 0.62 | 0.49 | 0 | 0.00 | 1.00 | 1.00 | 1.00 | ▅▁▁▁▇ |
| age_bin | 0 | 1.00 | 36.03 | 17.90 | 20 | 20.00 | 30.00 | 50.00 | 90.00 | ▇▂▃▁▁ |
| fincome | 425102 | 0.87 | 672.50 | 231.86 | 100 | 540.00 | 730.00 | 840.00 | 999.00 | ▂▂▃▅▇ |
| south | 0 | 1.00 | 0.26 | 0.44 | 0 | 0.00 | 0.00 | 1.00 | 1.00 | ▇▁▁▁▃ |
write_fst(cps_out, "data/cps_clean.fst")